{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "colab": {
   "name": "geodata_correction.ipynb",
   "provenance": []
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "02vaJRPJ_D_g",
    "colab_type": "text"
   },
   "source": [
    "# Geodata correction\n",
    "First, the old geodata is loaded and the most frequent locations are retrieved. The locations that are too specific to be true, or that could be names wrongly considered locations by the NER algorithm, are written into CSV files: `geo_subst.csv` (columns `wrong`, `correct`) for substitutions and `geo_drop.csv` (column `drop`) for deletions.\n",
    "\n",
    "Then, the incorrect locations listed in those CSV files are substituted or dropped, and the corrected geodata is saved to `corr_geo.csv.gz`.\n",
    "\n",
    "Some volumes are lost after the correction: a total of 15874 random volumes."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "_ZVHrL60_D_l",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "%%time\n",
    "# Imports, then load the uncorrected geodata\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import scipy.stats as scs\n",
    "import os\n",
    "from csv import DictReader\n",
    "\n",
    "geo = pd.read_csv('old_geo.csv.gz')\n"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "scrolled": true,
    "id": "cOpp4VBR_D_q",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "# Create CSV of the 1000 most frequent locations, for manual review.\n",
    "# NOTE(review): this cell used to read from `urban`, which is never defined in this\n",
    "# notebook (NameError on Restart & Run All). `geo` is assumed to be the intended\n",
    "# frame -- confirm, or restore the missing filter that produced `urban`.\n",
    "top_locations = (\n",
    "    geo.groupby([\"text_string\", \"formatted_address\"], as_index=False)[\"occurs_100k\"]\n",
    "    .sum()\n",
    "    .sort_values(by=\"occurs_100k\", ascending=False)\n",
    "    .head(1000)\n",
    ")\n",
    "top_locations.to_csv(os.path.join(\".\", 'top_locations_0,1000.csv'))"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "xBZlleMn_D_w",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "# Substitute wrong locations\n",
    "\n",
    "def substitutions_list(file_path):\n",
    "    \"\"\"Read (wrong, correct) address pairs from a CSV file.\n",
    "\n",
    "    The file needs a header row naming the columns `wrong` and `correct`.\n",
    "    Returns a list of (wrong, correct) tuples in file order.\n",
    "    \"\"\"\n",
    "    # newline='' lets the csv module handle line breaks inside quoted fields\n",
    "    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:\n",
    "        return [(row[\"wrong\"], row[\"correct\"]) for row in DictReader(csvfile)]\n",
    "\n",
    "\n",
    "def substitution(geodata, subst_list):\n",
    "    \"\"\"Replace wrong `formatted_address` values with their corrections.\n",
    "\n",
    "    Pairs are applied one after another in list order, so a later pair\n",
    "    also acts on values produced by an earlier one.\n",
    "    `geodata` is modified in place and returned for convenience.\n",
    "    \"\"\"\n",
    "    for wrong, correct in subst_list:\n",
    "        geodata.loc[geodata[\"formatted_address\"] == wrong, \"formatted_address\"] = correct\n",
    "\n",
    "    return geodata\n"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "RODxGphp_D_0",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "s_list = substitutions_list(\"geo_subst.csv\")"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "jcSV5Flz_D_4",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "# substitution() edits geo in place; show a preview instead of echoing the whole frame\n",
    "substitution(geo, s_list).head()"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "bUASjNPT_D_9",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "# Delete non-urban locations (a.k.a. personal names)\n",
    "\n",
    "def corrections_list(file_path):\n",
    "    \"\"\"Read the addresses to drop from the `drop` column of a CSV file.\n",
    "\n",
    "    Returns the column values as a list, in file order.\n",
    "    \"\"\"\n",
    "    # newline='' lets the csv module handle line breaks inside quoted fields\n",
    "    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:\n",
    "        return [row[\"drop\"] for row in DictReader(csvfile)]\n",
    "\n",
    "\n",
    "def corrections_idx(geodata, corr_list):\n",
    "    \"\"\"Return the index labels of rows whose `formatted_address` is in `corr_list`.\n",
    "\n",
    "    A single vectorised membership test is used instead of one full scan\n",
    "    of the frame per address. Each matching row is listed once, in frame order.\n",
    "    \"\"\"\n",
    "    to_drop = geodata[\"formatted_address\"].isin(corr_list)\n",
    "    return geodata.index[to_drop].tolist()"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "jq5BsSK8_D__",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "c_list = corrections_list(\"geo_drop.csv\")"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "jgga-Hhy_EAC",
    "colab_type": "code",
    "outputId": "1aba1116-a9c3-42bd-f0da-5c22952c5b1e",
    "colab": {}
   },
   "source": [
    "idx_list = corrections_idx(geo, c_list)\n",
    "len(idx_list)"
   ],
   "execution_count": 0,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "50781"
      ]
     },
     "metadata": {
      "tags": []
     },
     "execution_count": 15
    }
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "Hb00jtLI_EAG",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "%%time\n",
    "# Drop by index label, so this does not depend on the index being 0..n-1 positions\n",
    "corr_geo = geo.drop(index=idx_list)\n",
    "assert len(corr_geo) == len(geo) - len(set(idx_list)), \"unexpected number of rows dropped\""
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "aiBOfDDX_EAJ",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "corr_geo.to_csv('corr_geo.csv.gz', compression='gzip')"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "EHh6YHes_EAM",
    "colab_type": "code",
    "colab": {}
   },
   "source": [
    "# Test substitution (mask is built from corr_geo itself, not from geo)\n",
    "corr_geo.loc[corr_geo[\"formatted_address\"] == \"Mars\"]"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "a7mNbIwq_EAO",
    "colab_type": "code",
    "outputId": "1bac3e1e-37bb-4fee-d21a-da64c2ab9447",
    "colab": {}
   },
   "source": [
    "#Test drop\n",
    "corr_geo.loc[corr_geo[\"formatted_address\"] == \"Productos Santa Mónica, Av Costa de Oro 648, Centro, 91700 Veracruz, Ver., Mexico\"]"
   ],
   "execution_count": 0,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>htid</th>\n",
       "      <th>text_string</th>\n",
       "      <th>corpus</th>\n",
       "      <th>occurs_raw</th>\n",
       "      <th>occurs_100k</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>year</th>\n",
       "      <th>wordcount</th>\n",
       "      <th>formatted_address</th>\n",
       "      <th>...</th>\n",
       "      <th>admin_1_std</th>\n",
       "      <th>admin_2</th>\n",
       "      <th>admin_3</th>\n",
       "      <th>locality</th>\n",
       "      <th>sublocality_1</th>\n",
       "      <th>neighborhood</th>\n",
       "      <th>route</th>\n",
       "      <th>colloquial_area</th>\n",
       "      <th>lat</th>\n",
       "      <th>lon</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>0 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [htid, text_string, corpus, occurs_raw, occurs_100k, title, author, year, wordcount, formatted_address, location_type, country_long, country_short, admin_1_long, admin_1_std, admin_2, admin_3, locality, sublocality_1, neighborhood, route, colloquial_area, lat, lon]\n",
       "Index: []\n",
       "\n",
       "[0 rows x 24 columns]"
      ]
     },
     "metadata": {
      "tags": []
     },
     "execution_count": 19
    }
   ]
  }
 ]
}